elk = read_csv("clean_data/elk.csv") |>
mutate(elk_id = factor(elk_id))
We begin exploratory data analysis by examining the geographic elk data. We examine the latitude and longitude for the individual elk each year. We find the elk with the furthest ranging longitude. We can see that the elk with the furthest ranging longitude are elk 900, 917, and 916. Elk 917 shows up in this list 3 times for the years 2013, 2014, and 2015. This elk is ranging farther each year.
elk |>
group_by(elk_id, year) |>
summarize(longitude_range = max(long) - min(long)) |>
arrange(desc(longitude_range)) |>
head(n= 10) |>
knitr::kable(col.names = c("Elk ID", "Year", "longitude Range"))
| Elk ID | Year | longitude Range |
|---|---|---|
| 917 | 2013 | 0.7775239 |
| 918 | 2014 | 0.5520892 |
| 706 | 2013 | 0.5067580 |
| 706 | 2012 | 0.4919770 |
| 916 | 2013 | 0.4790952 |
| 918 | 2013 | 0.4757999 |
| 917 | 2014 | 0.4695007 |
| 914 | 2013 | 0.4680962 |
| 913 | 2014 | 0.4584522 |
| 911 | 2014 | 0.4541982 |
Looking at the furthest ranging latitudes, we again see elk 917 ranging furthest in the years 2013, 2014, and 2015 with the range going up each year.
elk |>
group_by(elk_id, year) |>
summarize(latitude_range = max(lat) - min(lat)) |>
arrange(desc(latitude_range)) |>
head(n= 10) |>
knitr::kable(col.names = c("Elk ID", "Year", "latitude Range"))
| Elk ID | Year | latitude Range |
|---|---|---|
| 900 | 2009 | 0.8055470 |
| 917 | 2015 | 0.8034336 |
| 916 | 2014 | 0.8032785 |
| 917 | 2014 | 0.8013303 |
| 654 | 2011 | 0.7895229 |
| 916 | 2013 | 0.7861957 |
| 914 | 2014 | 0.7851894 |
| 654 | 2010 | 0.7849657 |
| 917 | 2013 | 0.7742691 |
| 911 | 2013 | 0.7731564 |
Looking at total elk movement by month, we see see a clear relationship. The months of May through October see consistent monthly travel of approximately 100 kilometers. The months of March, April, November, and December see highly varied travel distances, ranging from tens of kilometers to over three hundred.
keeper
elk |>
mutate(month = month(dt, label = TRUE)) |>
group_by(elk_id, year, month) |>
summarize(
total_distance_km = sum(dist_km, na.rm = TRUE)
) |>
ggplot(aes(x = factor(month), y = total_distance_km, fill = factor(month))) +
geom_violin() +
labs(
title = "Total Movement by Month",
x = "Month",
y = "Total Distance Traveled (km)",
fill = "Month") +
theme(legend.position = 'none')

The elk have a consistent median travel distance of approximately 2.9 kilometers per day, though they may each travel up to 10 kilometers today.
keep
daily_median =
elk |>
mutate(date = date(dt)) |>
group_by(elk_id, date) |>
summarize(daily_sum = sum(dist_km)) |>
drop_na() |>
pull(daily_sum) |>
median()
elk |>
mutate(date = date(dt)) |>
group_by(elk_id, date) |>
summarize(daily_sum = sum(dist_km)) |>
ggplot(aes(x = elk_id, y = daily_sum)) +
geom_boxplot(outliers = FALSE) +
geom_hline(yintercept = daily_median, color = "blue") +
labs(title = "Daily Averages (km)",
x = "Elk ID",
y = "Distance (km)")

Now let’s look at how far they travel throughout the year.
#calculate the distance they travel each month
elk_seasonal =
elk |>
mutate(
season = case_when(
month %in% c(12, 1, 2) ~ "Winter",
month %in% c(3, 4, 5) ~ "Spring",
month %in% c(6, 7, 8) ~ "Summer",
month %in% c(9, 10, 11) ~ "Fall"
)
) |>
arrange(elk_id, dt) # Ensure data is ordered
# Calculate distances and group by year, season, and elk_id
elk_distances_seasonal =
elk_seasonal |>
group_by(elk_id, year, season) |>
summarize(
total_distance_km = sum(dist_km, na.rm = TRUE)
) |>
ungroup()
# View results
elk_distances_seasonal |>
pivot_wider(names_from = season, values_from = total_distance_km) |>
knitr::kable(col.names = c("Elk ID", "Year", "Fall", "Spring", "Summer", "Winter"))
| Elk ID | Year | Fall | Spring | Summer | Winter |
|---|---|---|---|---|---|
| 572 | 2006 | 358.1692 | 243.4525102 | 270.19248 | 115.6640 |
| 572 | 2007 | 262.7980 | 307.3771264 | 296.25650 | 266.9184 |
| 572 | 2008 | NA | 0.5268033 | NA | 102.2457 |
| 595 | 2007 | 309.8790 | 332.3771542 | 286.95792 | 209.7931 |
| 595 | 2008 | NA | 44.7942304 | NA | 147.0538 |
| 654 | 2010 | 458.5519 | 377.0839079 | 276.17203 | 105.0326 |
| 654 | 2011 | 248.2131 | 610.7469717 | 355.08460 | 271.0322 |
| 656 | 2010 | 101.2863 | 259.1596272 | 196.20043 | NA |
| 671 | 2012 | 297.2838 | 322.6824827 | 253.54017 | 126.0258 |
| 671 | 2013 | 257.3041 | 352.6952899 | 230.10628 | 261.9005 |
| 671 | 2014 | NA | NA | NA | 123.6739 |
| 706 | 2012 | 349.3021 | 560.6219499 | 394.64256 | 252.7928 |
| 706 | 2013 | 399.1241 | 594.4674206 | 466.60326 | 404.6597 |
| 706 | 2014 | NA | 84.5958876 | NA | 244.3947 |
| 900 | 2008 | 444.8889 | NA | 173.34661 | 191.9041 |
| 900 | 2009 | 410.9002 | 483.7385340 | 377.85262 | 306.8446 |
| 900 | 2010 | NA | 186.4186971 | NA | 158.6582 |
| 902 | 2008 | 531.6827 | NA | 171.93479 | 251.8201 |
| 902 | 2009 | 464.3840 | 522.5132624 | 345.27205 | 411.7242 |
| 902 | 2010 | NA | 179.1117077 | NA | 169.1765 |
| 903 | 2008 | 405.1701 | NA | 253.70199 | 152.7498 |
| 903 | 2009 | 378.3071 | 463.3102610 | 422.34843 | 275.0602 |
| 903 | 2010 | NA | 154.3125271 | NA | 147.2773 |
| 907 | 2013 | 456.0894 | NA | 95.12488 | 185.9591 |
| 907 | 2014 | 418.9940 | 575.0174816 | 269.30745 | 361.2967 |
| 907 | 2015 | NA | 457.7967286 | 25.33632 | 228.5525 |
| 909 | 2013 | 488.8248 | NA | 116.90919 | 248.0778 |
| 909 | 2014 | 313.6993 | 631.8496976 | 253.46676 | 437.7405 |
| 909 | 2015 | NA | 567.1980986 | 131.50990 | 292.2471 |
| 911 | 2013 | 372.7446 | NA | 114.29155 | 206.5388 |
| 911 | 2014 | 213.2257 | 613.1354415 | 206.22425 | 382.6545 |
| 913 | 2013 | 287.2265 | NA | 107.38545 | 186.2400 |
| 913 | 2014 | 311.1236 | 459.6227272 | 229.70137 | 313.1485 |
| 913 | 2015 | NA | 482.2381958 | 199.57717 | 134.8874 |
| 914 | 2013 | 419.0289 | NA | 93.43759 | 109.7464 |
| 914 | 2014 | 242.4557 | 456.2527138 | 199.99580 | 221.3847 |
| 916 | 2013 | 310.0486 | NA | 146.50213 | 282.4045 |
| 916 | 2014 | 305.6969 | 592.8954920 | 312.35485 | 476.8180 |
| 917 | 2013 | 330.8002 | NA | 91.84982 | 166.5031 |
| 917 | 2014 | 279.0076 | 538.8104139 | 269.01899 | 268.2958 |
| 917 | 2015 | NA | 408.4196419 | 192.93056 | 214.6738 |
| 918 | 2013 | 238.1352 | NA | 127.69952 | 143.4284 |
| 918 | 2014 | 288.7280 | 524.7055873 | 235.35097 | 309.8905 |
Now I am curious what the overlap of the elk are? The data collected is between 2006 and 2015, but we do not have data from all of the elk ids during that time period. It turns out that wild elk can only live up to 10-12 years, according to worlddeer.ord. So let’s find out what the time range is for each of these elk.
One thing I want to check is if the number of data points for each elk is skewing the data. Let’s see how the number of data points spread across the time frame.
Maybe
elk |>
group_by(elk_id) |>
summarize(
total_distance_km = sum(dist_km, na.rm = TRUE)
) |>
knitr::kable(digits = 0)
| elk_id | total_distance_km |
|---|---|
| 572 | 2224 |
| 595 | 1331 |
| 654 | 2702 |
| 656 | 557 |
| 671 | 2225 |
| 706 | 3751 |
| 900 | 2735 |
| 902 | 3048 |
| 903 | 2652 |
| 907 | 3073 |
| 909 | 3482 |
| 911 | 2109 |
| 913 | 2711 |
| 914 | 1742 |
| 916 | 2427 |
| 917 | 2760 |
| 918 | 1868 |
# And maybe let's check how many data points we have
elk |>
ggplot(aes(x = elk_id, fill = factor(year))) +
geom_bar() +
labs(title = "Total Data Points for Each Elk",
x = "Elk ID",
y = "# of data points",
fill = "Year")
We can see that the data is not consistent for for all of the elk across
the 2006-2015 time period. However, there is a lot of overlap for 2
groups of elk. In further analysis we will use the 8 elk that overlap in
2013 and 2014.
elk |>
group_by(elk_id) |>
summarize(start_time = min(dt),
end_time = max(dt)) |>
knitr::kable()
| elk_id | start_time | end_time |
|---|---|---|
| 572 | 2006-03-01 18:01:00 | 2008-03-01 06:00:00 |
| 595 | 2007-01-16 18:01:00 | 2008-03-08 14:00:00 |
| 654 | 2010-03-16 15:00:00 | 2011-11-17 07:00:00 |
| 656 | 2010-03-16 15:00:00 | 2010-10-25 17:02:00 |
| 671 | 2012-03-22 16:00:00 | 2014-02-26 06:01:00 |
| 706 | 2012-01-24 10:30:00 | 2014-03-15 07:30:00 |
| 900 | 2008-07-13 00:01:00 | 2010-03-31 16:00:00 |
| 902 | 2008-07-09 06:00:00 | 2010-04-01 10:00:00 |
| 903 | 2008-07-09 04:01:00 | 2010-04-01 12:00:00 |
| 907 | 2013-07-16 10:01:00 | 2015-06-18 06:00:00 |
| 909 | 2013-07-16 10:01:00 | 2015-07-30 10:00:00 |
| 911 | 2013-07-16 18:00:00 | 2014-12-22 10:00:00 |
| 913 | 2013-07-16 14:01:00 | 2015-08-25 06:00:00 |
| 914 | 2013-07-16 14:00:00 | 2014-12-22 10:00:00 |
| 916 | 2013-07-16 14:01:00 | 2014-12-22 10:00:00 |
| 917 | 2013-07-16 18:01:00 | 2015-08-13 10:01:00 |
| 918 | 2013-07-16 18:00:00 | 2014-12-18 12:00:00 |
elk_df_2013.2014 =
elk |>
filter(dt >= as_date("2013-07-16") &
dt <= as_date("2014-12-30"),
elk_id %in% c(907, 909, 911, 913, 914, 916, 917, 918))
The most overlapping data occurs between July 16th 2013 and December 30th, 2014. We have 8 elk that have data for this time range: 907, 909, 911, 913, 914, 916, 917, 918. Now let’s see how they move around!
Plot the seasonal data
elk_monthly = elk_df_2013.2014 |>
mutate(
month = month(dt, label = TRUE),
) |>
arrange(elk_id, dt)
# Calculate distances and group by year, season, and elk_id
elk_distance_monthly =
elk_monthly |>
group_by(elk_id, year, month) |>
summarize(
total_distance_km = sum(dist_km, na.rm = TRUE)
) |>
ungroup()
elk_distance_monthly |>
ggplot(aes(x = as.numeric(month), y = total_distance_km, color = elk_id)) +
geom_point() +
geom_smooth(se = FALSE) +
scale_x_continuous(
breaks = 1:12, # Numeric positions for each month
labels = month.name # Use month names as labels
) +
labs(title = "Elk Movement by Month 2013-07-16 to 2014-12-30",
x = "month",
y = "total distance (km)")
It looks like May and December are when the elk are moving around the
most.
I want to check if the number of observations for each month has an impact on these trends. I’m going to see if normalizing the data based on the number of observations has a significant impact on the trends.
elk_distance_normalized =
elk_monthly |>
group_by(elk_id, year, month) |>
summarize(
total_distance_km = sum(dist_km, na.rm = TRUE),
count = n()
) |>
mutate(normal_distance = total_distance_km/count)
elk_distance_normalized |>
ggplot(aes(x = as.numeric(month), y = normal_distance, color = elk_id)) +
geom_point() +
geom_smooth(se = FALSE) +
scale_x_continuous(
breaks = 1:12, # Numeric positions for each month
labels = month.name # Use month names as labels
) +
labs(title = "Elk Movement by Month 2013-07-16 to 2014-12-30",
x = "month",
y = "normalized distance")
The normalized data has a similar trend so we will continue with the raw
data in km.
Let’s see where the elk are moving around in the month of May. Below is the elk mapped in Yellowstone park with their start and end points.
# Let's make a custom elk icon!!
elk_icon <- makeIcon(
iconUrl = "pics/elk_icon.png", # Replace with the URL of your moose image
iconWidth = 30, iconHeight = 30
)
filtered_data = elk_df_2013.2014 |>
filter(month == 5)
# Create a color palette (limited to 9 elk IDs for "Set1")
elk_ids = unique(filtered_data$elk_id) # Get unique elk IDs
num_colors = length(elk_ids) # Ensure we don't exceed palette limit
path_colors = colorFactor(palette = RColorBrewer::brewer.pal(num_colors, "Set1"), domain = elk_ids)
# Initialize leaflet map
map <- filtered_data |>
group_by(elk_id) |>
summarize(start_long = first(long), start_lat = first(lat),
end_long = last(long), end_lat = last(lat))|>
ungroup() |>
leaflet() |>
addProviderTiles(providers$CartoDB.Positron, group = "Base Map") |>
addProviderTiles(providers$Esri.NatGeoWorldMap, group = "NatGeo Map") |>
addMarkers(lng = ~start_long, lat = ~start_lat, icon = elk_icon, popup = ~paste("Start Point: Elk", elk_id)) |>
addMarkers(lng = ~end_long, lat = ~end_lat, icon = elk_icon, popup = ~paste("End Point: Elk ", elk_id))
# Add lines for each elk's path
for (id in elk_ids) {
elk_data <- filtered_data |> filter(elk_id == id) # Subset data for each elk
map <- map |>
addPolylines(
data = elk_data,
lng = ~long, lat = ~lat,
color = path_colors(id), # Assign unique color for each elk
weight = 2,
opacity = 0.8,
label = ~paste("Elk ID:", id) # Label showing elk ID
)
}
# Add a legend for the elk IDs
map <- map |>
addLegend(
position = "topright",
pal = path_colors,
values = elk_ids,
title = "Elk ID"
)
# Print the map
map
<<<<<<< HEAD
=======
>>>>>>> eb58f9462c0a14b588a2d78c06ed4cc8739daf37
From the leaflet we can tell that the elk all started somewhat close together and migrated northward. They all followed a similar path up to Jackson Lake and then some started to choose different paths. Elk 916 decided they liked it and stayed close to the lake for the rest of the month. Elk 911, 914, and 917 seemed to stay together from start until end. Maybe they are in the same heard. Cute!
Now let’s look at the paths for December because that was also a big movement month. There appears to be one elk, elk id 911 that traveled between Yellowstone National Park and the National elk refuge, whereas no other elk made the same journey in that month.
<<<<<<< HEADfiltered_data = elk_df_2013.2014 |>
filter(month == 12,
year == 2014)
# Initialize leaflet map
map <- filtered_data |>
group_by(elk_id) |>
summarize(start_long = first(long), start_lat = first(lat),
end_long = last(long), end_lat = last(lat))|>
ungroup() |>
leaflet() |>
addProviderTiles(providers$CartoDB.Positron, group = "Base Map") |>
addProviderTiles(providers$Esri.NatGeoWorldMap, group = "NatGeo Map") |>
addMarkers(lng = ~start_long, lat = ~start_lat, icon = elk_icon, popup = ~paste("Start Point: Elk", elk_id)) |>
addMarkers(lng = ~end_long, lat = ~end_lat, icon = elk_icon, popup = ~paste("End Point: Elk ", elk_id))
# Add lines for each elk's path
for (id in elk_ids) {
elk_data <- filtered_data |> filter(elk_id == id) # Subset data for each elk
map <- map |>
addPolylines(
data = elk_data,
lng = ~long, lat = ~lat,
color = path_colors(id), # Assign unique color for each elk
weight = 2,
opacity = 0.8,
label = ~paste("Elk ID:", id) # Label showing elk ID
)
}
# Add a legend for the elk IDs
map <- map |>
addLegend(
position = "topright",
pal = path_colors,
values = elk_ids,
title = "Elk ID"
)
# Print the map
map
>>>>>>> eb58f9462c0a14b588a2d78c06ed4cc8739daf37
We read in the combined data set from all_data.csv. This
data set contains the same geographic data as the elk.csv
data along with land cover data, temperature, and water quality readings
in that same geographic area. The data processing steps to create this
file can be found in the data cleaning page. T
all_data = read_csv('clean_data/all_data.csv')
We also read in the land cover data for the entire region. The yellow regions of the map are fully covered, many cases by water or snow. The large yellow regions represent Jackson lake, Yellowstone Lake, and Heart Lake. The green areas represent land covered by an abundance of foliage or smaller bodies of water, such as creeks. The dark blue and purple regions represent land with scant foliage cover, such as large rocks.
small_land_coord = rast('clean_data/land_cover.tif')
plot(small_land_coord)

Patchwork combine these
Below is a histogram of the distribution of land cover in the entire region. The most common land cover value is 137, representing sparse vegetation. The second most common value is 500, representing thick vegetation or grasses. The maximum value of 583 represents land covered by water.
land_coord_df = as.data.frame(small_land_coord)
land_coord_df |>
ggplot(aes(x = land_cover)) +
geom_histogram()

Here is the distribution of land cover where elk were observed. It is clear that elk spend more of their time in land with cover between 300 and 500, but this plot is confounded by the distribution of land cover data in the region. In other words, there is much low-cover land, and elk may be forced to spend more time there than they would otherwise prefer.
all_data |>
ggplot(aes(x = land_cover)) +
geom_histogram()

In order to get a better understanding of the elk’s habits, we will plot the relative frequency of the time that they spend at each land cover value. We begin by binning each land cover value for the region and the elk to find the densities. We divide the elk density by the regional density of each bin. Plotting, we see that elk prefer to spend their time in the regions of middling land cover.
# Define bin breaks
land_cover_bins <- seq(min(land_coord_df$land_cover), max(land_coord_df$land_cover), length.out = 30)
# Bin the vectors
region_density <- cut(land_coord_df$land_cover, land_cover_bins, right = FALSE, labels = FALSE)
elk_density <- cut(all_data$land_cover, land_cover_bins, right = FALSE, labels = FALSE)
# Calculate sums within each bin
sum1 <- tapply(land_coord_df$land_cover, region_density, sum, na.rm = TRUE) / sum(land_coord_df$land_cover)
sum2 <- tapply(all_data$land_cover, elk_density, sum, na.rm = TRUE) / sum(all_data$land_cover)
# Divide the sums of corresponding bins
relative_land_cover =
data.frame(
land_cover_bins = land_cover_bins[-1],
density_elk = sum2 / sum1)
relative_land_cover |>
ggplot( aes(x = land_cover_bins, y = density_elk)) +
geom_bar(stat = "identity") +
labs(title = "Relative Density of Elk by Land Cover", x = "Land Cover", y = "Elk Density")

In this plot, we see that the elk spend most of their time in the low lying, well vegetated areas of Yellowstone.
ggplot() +
geom_spatraster(data = small_land_coord) +
scale_fill_gradient2(low="white", high="darkgrey", guide="colorbar") +
geom_density_2d(data = elk, aes(x=long, y=lat, color = 'red'), alpha = 0.1)

Zooming in on the crossing, we see that the elk take one of three paths between Yellowstone national park and the Elk reserve. The first, and furthest West goes between Jackson Lake on the West and Pilgrim mountain to the East. The middle path follows Pilgrim Creek. The East path follows Pacific Creek.
ggplot() +
geom_spatraster(data = small_land_coord) +
geom_path(data = elk, aes(x=long, y=lat, color = 'red'), alpha = 0.7) +
scale_fill_gradient2(low="white", high="darkgrey", guide="colorbar") +
ylim(43.75 ,44.25) +
xlim(-110.8, -110.2)

We read in the weather data and aggregate the elk data by day.
Among the key factors that we considered to be potentially influential to elk migration was local weather patterns, specifically precipitation (including rain and snow) and average temperature. For this, we analyzed weather station data provided by NOAA National Centers for Environmental Information, utilizing daily weather records from 2006 to 2015 (to correspond with our elk migration data).
Selecting Appropriate Weather Stations
Given that there were numerous weather stations in the Yellowstone/Grand Teton area in Wyoming, several of which were contained within the various elk pathways we analyzed, we decided that the best way to effectively approximate the weather patterns across the entirety migration pathways would be to use the data provided by the four stations in the plots below, which span a wide coverage of the migration areas (shown in red).
# API key
register_stadiamaps(key = '29074900-bb6e-4a71-8f91-454c28190f88',
write = FALSE)
min_lat = all_data |> pull(lat) |> min()
max_lat = all_data |> pull(lat) |> max()
rng_lat = abs(min_lat - max_lat)
lowerleftlat = min_lat
upperrightlat = max_lat
min_long = all_data |> pull(long) |> min()
max_long = all_data |> pull(long) |> max()
rng_long = abs(min_long - max_long)
lowerleftlon = min_long - rng_long
upperrightlon = max_long + rng_long
# Define area by `c(lowerleftlon, lowerleftlat, upperrightlon, upperrightlat)`
myLocation <- c(left = lowerleftlon,
bottom = lowerleftlat,
right = upperrightlon,
top = upperrightlat)
myMap <- get_stadiamap(
bbox=myLocation,
maptype = "stamen_terrain",
crop=FALSE)
ggmap(myMap) +
geom_point(
data = weather_stations,
aes(x = longitude, y = latitude)) +
ggrepel::geom_text_repel(
data = weather_stations,
aes(x = longitude, y = latitude, label = name))+
geom_path(
data = elk,
aes(x=long, y=lat),
alpha = 0.5,
color = "red") +
labs(x = "longitude", y = "latitude")

Once we selected the appropriate weather stations and reduced our weather data set accordingly, we could then begin considering various weather-related research questions. These questions were grouped into two categories: analyzing the weather patterns visible in the study area over the 2006-2015 period, and relating the weather data with the elk migration data to see if there were any visible patterns and/or trends between the two.
Evaluating Weather Patterns and Trends In the
weather dataset, we were mainly concerned with four weather
variables: prcp (precipitation), snow
(snowfall), snwd (snow depth), and tavg
(average temperature). Each of these variables were visualized using the
same graph types. For precipitation, snowfall, and snow depth, the daily
measurements among the four weather stations were first aggregated by
station, year, and month, to show the monthly totals among each station
throughout the 2006-2015 period. These data sets were further condensed
into average monthly totals of precipitation, snowfall and snow depth
among the four stations; this method provided us with an estimated sum
of these three variables covering a wide swath of the study area.
Average temperature was calculated by aggregating daily average
temperature measurements by month and year and taking the mean of these
values.
Starting with precipitation patterns, the plots below show relatively consistent patterns in monthly rainfall throughout the given year, in which the summer months often saw less precipitation out of all other seasons, whereas late winter through spring showed generally higher precipitation. While the exact months of these highs and lows vary per year, the general pattern shown each year is a U-shaped distribution from the start of the year to the end.

Perhaps not surprisingly, snowfall and snow depth were both highest in the winter and early spring months and non-existent in the summer months through early fall. There were slight differences between these two variables such that the highest snow depth measurements were visible for longer (more months out of the year) than when the highest snowfall measurements were visible. These patterns are apparent when comparing the plots side-by-side below. These snow patterns are expected for this area and help validate the reasonability of our weather data.

Finally, when visualizing the monthly average temperatures (see below), we see a consistent pattern each year in which the monthly temperatures rise to their peaks in the summer months (July, specifically) and gradually decline after this peak, and the lowest temperatures occurred in January and December. As was the case with our snow data, this is a surprising pattern for Wyoming, and helps validate our data by ensuring that no unexpected shifts in seasonal patterns occurred over the years.

Analyzing Weather Data and Elk Migration Patterns
After examining the weather data by itself, we then visualized these datasets with the elk migration data to see if there were any visible patterns or trends that emerged. More specifically, we wanted to see if changes in each of the four weather measurements affected the total distance traveled by each elk on a given day, and we did so by plotting the total daily distance traveled as the dependent variable against each of the four weather measurements as the dependent variable.
Following the same order as the previous section, we first plotted the daily distance traveled by the elk against the daily precipitation recordings. In the scatterplot below, we see that the daily distance traveled varied more widely when the daily precipitation measurement was lowest, and that this variation decreased as precipitation levels increased. The smooth-mean line (shown in blue below) across all years appears to have a slight positive slope; this is also reflected in the smooth-mean line graph separated by each elk, which shows that several of the elk appear to increase their distance traveled as precipitation increases. Given that many of the other elk do not follow this same trend, and the smooth-mean lines appear to vary from one another, it is not clear to say whether precipitation increases saw an increase in distance traveled by elk.
<<<<<<< HEAD ======= >>>>>>> eb58f9462c0a14b588a2d78c06ed4cc8739daf37Similar to the precipitation plot, the snowfall scatterplot also shows greater variation of daily distance traveled among elk at lower snowfall measurements than at higher snowfall measurements. The smooth-mean trend (blue line) does not appear to have much of a positive or negative trend, and while there are only three elk available with associated snowfall data, the smooth-mean trends for each elk follow a similar pattern in which they traveled longer distances on days with lower snowfall, followed by a sharp decline in distance traveled on days with snowfall starting at between approximately 0.3 (Elk 706) and 1.2 in (Elk 654), followed by a gradual increase in distance traveled on days with snowfall starter at or greater than approximately 1.2 (Elk 706) and 3.5 in (Elk 654).
<<<<<<< HEAD ======= >>>>>>> eb58f9462c0a14b588a2d78c06ed4cc8739daf37For snow depth, the overall mean-smooth trend does not show a clear positive or negative trend of distance traveled in response to changes in snow depth. However, when examining the mean-smooth trends of each elk , we see similar patterns among all elk, in which there is a rapid increase in distance traveled on days with lower snow depth (between 0 and ~10-15 in for most elk), followed by a rapid decrease in distance traveled among elk on days with snow depth between ~15 and 35-45 in for most elk, then rapid increases in distance traveled for snow depths beyond this point.
<<<<<<< HEAD ======= >>>>>>> eb58f9462c0a14b588a2d78c06ed4cc8739daf37Lastly, after examining the relationship between daily distance traveled and average daily temperature, we can see a slight downward trend in daily distance traveled among elk as average daily temperature increases, as shown by the main blue mean-smooth line that shows a slightly negative overall slope. This negative relationship is visible in the other plot in which daily distance traveled in response to average temperature is shown for each elk. In this plot, we can see that in general, most elk traveled less on a given day as the average daily temperature increased, and some elk showed a slight uptick in daily distance traveled in the middle of the two temperature extremes (between approximately 25 and 40 degrees F for most elk), before a sharp decline in distance traveled on days with temperatures greater than this range.
<<<<<<< HEAD ======= >>>>>>> eb58f9462c0a14b588a2d78c06ed4cc8739daf37WRITE ABOUT THE SOURCE
# raw data, long format
water_quality = read_csv('clean_data/water_quality.csv')
# processed data, aggregated by year-month
water_quality2 = read_csv('clean_data/water_quality2.csv')
water_quality_locations =
water_quality %>%
filter(location_id %in% c('GRTE_SNR01', 'GRTE_SNR02')) %>%
select(location_id, location_name, latitude, longitude) %>%
distinct()
water_exploration =
water_quality2 |>
group_by(
location_id,
characteristic_name
) |>
summarize(
all_time_mean = mean(monthly_mean, na.rm = TRUE),
all_time_var = var(monthly_mean, na.rm = TRUE)) |>
pivot_wider(
names_from = location_id,
values_from = c(all_time_mean, all_time_var)
) |>
mutate(
standardized_difference = abs(all_time_mean_GRTE_SNR01 - all_time_mean_GRTE_SNR02) / sqrt(all_time_var_GRTE_SNR01 + all_time_var_GRTE_SNR02)
)
The site GRTE_SNR01 has higher Chloride, Sodium, Sulfur, Arsenic, and Potassium than GRTE_SNR02.
To do: statistical test
water_exploration |>
arrange(desc(standardized_difference)) |>
dplyr::select(
characteristic_name,
all_time_mean_GRTE_SNR01,
all_time_mean_GRTE_SNR02,
standardized_difference
) |>
head(5)
# A tibble: 5 × 4
characteristic_name all_time_mean_GRTE_S…¹ all_time_mean_GRTE_S…²
<chr> <dbl> <dbl>
1 Chloride mg/l 12.2 3.46
2 Sodium mg/l 22.1 7.39
3 Sulfur, sulfate (SO4) as SO4 mg… 22.5 8.37
4 Arsenic mg/l 0.0300 0.00412
5 Potassium mg/l 3.26 1.63
# ℹ abbreviated names: ¹all_time_mean_GRTE_SNR01, ²all_time_mean_GRTE_SNR02
# ℹ 1 more variable: standardized_difference <dbl>
Examining the data more closely, we see that GRTE_SNR02 has a consistently low level of these chemicals, while GRTE_SNR01 occasionally achieves higher levels of these chemicals.
water_quality2 |>
filter(characteristic_name %in% c(
'Chloride mg/l', 'Sodium mg/l',
'Sulfur, sulfate (SO4) as SO4 mg/l',
'Potassium mg/l', 'Arsenic mg/l')) |>
ggplot(aes(fill = location_id, x = monthly_mean)) +
facet_wrap(.~ characteristic_name, scales = c('free_x'))+
geom_histogram()

Examining water quality by year. There was not any significant increase or decrease of these minerals at either location.
water_quality2 |>
filter(characteristic_name %in% c(
'Chloride mg/l', 'Sodium mg/l',
'Sulfur, sulfate (SO4) as SO4 mg/l',
'Potassium mg/l', 'Arsenic mg/l')) |>
ggplot(
aes(x = factor(year), y = monthly_mean, fill = location_id)
) +
facet_wrap(. ~ characteristic_name, scales = 'free_y') +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

In July and August of 2010, there is a large spike in the amount of Arsenic at the GRTE_SNR01 location.
water_quality2 |>
mutate(year_month = paste0(year, '-', month)) %>%
filter(
characteristic_name == 'Arsenic mg/l',
year %in% c(2009, 2010, 2011)) |>
ggplot(aes(x = year_month, y = monthly_max, color = location_id)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))

Mapping the Elk’s location during that time, we see that they were indeed downstream of the GRTE_SNR01 location, and were likely exposed to elevated levels of Arsenic during this time.
arsenic_measures = names(all_data)[str_detect(names(all_data), 'Arsenic')]
arsenic_exposure =
all_data %>%
filter(
year == 2010,
month %in% c(7, 8)
)
min_lat = arsenic_exposure |> pull(lat) |> min()
max_lat = arsenic_exposure |> pull(lat) |> max()
rng_lat = abs(min_lat - max_lat)
lowerleftlat = min_lat
upperrightlat = max_lat
min_long = arsenic_exposure |> pull(long) |> min()
max_long = arsenic_exposure |> pull(long) |> max()
rng_long = abs(min_long - max_long)
lowerleftlon = min_long - rng_long
upperrightlon = max_long + rng_long
# Define area by `c(lowerleftlon, lowerleftlat, upperrightlon, upperrightlat)`
myLocation <- c(left = lowerleftlon,
bottom = 43.5,
right = upperrightlon,
top = 44.3)
myMap <- get_stadiamap(
bbox=myLocation,
maptype = "stamen_terrain",
crop=FALSE)
elk_ids2 = unique(arsenic_exposure$elk_id)
num_colors = length(elk_ids2) # Ensure we don't exceed palette limit
path_colors =
colorFactor(
palette = RColorBrewer::brewer.pal(num_colors, "Set1"),
domain = elk_ids2)
map = ggmap(myMap)
for(id in elk_ids2){
temp_elk =
arsenic_exposure %>%
filter(elk_id == id)
map =
map +
geom_path(
data = temp_elk,
aes(x=long, y=lat, color = path_colors(id))) +
geom_line(alpha = 0)
}
map =
map +
geom_point(
data = water_quality_locations,
aes(x = longitude, y = latitude))+
geom_text(
data = water_quality_locations,
aes(x = longitude, y = latitude, label = location_id),
nudge_x = 0.18)
map
